#!/bin/bash
# Launch distributed SFT fine-tuning; the log file path and model name are
# provided by setup_env.sh, which prints "<log_file> <model_name>" on stdout.
set -uo pipefail  # error on unset vars; pipeline status reflects first failure

# -r: do not mangle backslashes in the paths read from setup_env.sh.
# Guard: abort early if setup_env.sh produced no usable output.
read -r log_file MODEL_NAME < <(bash setup_env.sh) || [[ -n "${log_file:-}" ]] || {
    echo "error: could not read log file / model name from setup_env.sh" >&2
    exit 1
}
echo "log file: $log_file" "$MODEL_NAME"
# ----------------------------------------------------------------------------------------
# Expose all 16 GPUs of this node to the training job (one rank per GPU below).
export CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15'

model_name_or_path=/mnt/share1/models/baichuan-inc/Baichuan2-7B-Base
template=baichuan2  # IMPORTANT: prompt template must match the base-model family
NPROC_PER_NODE=16   # worker processes per node — one per visible GPU
NNODES=1            # total number of nodes in the job
RANK=0              # node rank: 0 on the master host, 1+ on the other machines
MASTER_ADDR=127.0.0.1
MASTER_PORT=14788


batch=4             # per-device train/eval batch size
# Launch one training worker per GPU via the torch.distributed elastic runner.
# All expansions are quoted so paths/values with spaces cannot word-split (SC2086).
python3 -m torch.distributed.run \
    --nproc_per_node "$NPROC_PER_NODE" \
    --nnodes "$NNODES" \
    --node_rank "$RANK" \
    --master_addr "$MASTER_ADDR" \
    --master_port "$MASTER_PORT" \
    ../../src/train.py \
    --stage sft \
    --deepspeed ../../examples/deepspeed/ds_z3_config.json \
    --model_name_or_path "$model_name_or_path" \
    --do_train \
    --dataset alpaca_gpt4_en \
    --dataset_dir ../../data \
    --template "$template" \
    --finetuning_type full \
    --output_dir "./outputllm_${MODEL_NAME}" \
    --overwrite_cache \
    --overwrite_output_dir \
    --cutoff_len 2048 \
    --preprocessing_num_workers 16 \
    --per_device_train_batch_size "$batch" \
    --per_device_eval_batch_size "$batch" \
    --gradient_accumulation_steps 2 \
    --lr_scheduler_type cosine \
    --logging_steps 1 \
    --warmup_steps 20 \
    --save_steps 40000 \
    --learning_rate 5e-5 \
    --num_train_epochs 3.0 \
    --max_samples 50000000 \
    --plot_loss \
    --print_param_status \
    --flash_attn auto \
    --bf16 \
    2>&1 | tee -a "$log_file"
# `| tee` would otherwise swallow the trainer's exit code — capture it explicitly.
train_status=${PIPESTATUS[0]}

    # --flash_attn \
        # --flash_attn fa2\
    # -- disable_gradient_checkpointing

echo "log saved: $log_file"
echo "------EOF" | tee -a "$log_file"
# Propagate the training run's exit status so callers/CI can detect failures.
exit "$train_status"
